1 Introduction

In this script, we go through all the pre-registered proposed analyses. As a reminder, the main research questions where as follows:

  1. To what extent does infants’ preference for IDS as measured in a laboratory setting predict their vocabulary at 18 and 24 months?
  2. Does the relation between IDS preference and vocabulary size change over development?
  3. Are there systematic differences in the strength of this relationship across the language communities in our sample?

Here we present the main “sample theory based” analyses (also known as frequentist), separately on the North American and UK samples in parallel to answer our first two research questions, then together to answer our third research question. In the next section (03b) provide additional Bayesian statistics where a null effect was found, as specified in the pre-registration.

TODO: Delete any libraries that are solely for the Bayes analysis and can be deleted here.

# Library imports, general settings ==============
library(tidyverse); library(egg)
library(knitr)
library(lme4); library(lmerTest); library(simr)
# As in our discussion with Mike, we will use lmerTest for calculating p values
# in the mixed-effects models (noted as a deviation) 
# library(brms); library(rstan)
# library(future)
# plan(multicore, workers = 8) # Swap to multiprocess if running from Rstudio
library(lattice)
library(effects)
library(sjPlot)
library(robustlmm)
library(car)

# Load model comparison functions
source("helper/lrtests.R")

# Deal with package priority issues
select <- dplyr::select

theme_set(theme_bw(base_size = 10))
options("future" = T)
#knitr::opts_chunk$set(cache = TRUE)

print(sessionInfo()) #listing all info about R and packages info
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] furrr_0.2.1          future.apply_1.7.0   future_1.21.0       
##  [4] bridgesampling_1.0-0 rstan_2.21.2         StanHeaders_2.21.0-7
##  [7] brms_2.14.4          Rcpp_1.0.5           car_3.0-10          
## [10] robustlmm_2.4-4      sjPlot_2.8.7         effects_4.2-0       
## [13] carData_3.0-4        lattice_0.20-41      simr_1.0.5          
## [16] lmerTest_3.1-3       lme4_1.1-26          Matrix_1.3-2        
## [19] knitr_1.30           egg_0.4.5            gridExtra_2.3       
## [22] forcats_0.5.0        stringr_1.4.0        dplyr_1.0.2         
## [25] purrr_0.3.4          readr_1.4.0          tidyr_1.1.2         
## [28] tibble_3.0.4         ggplot2_3.3.3        tidyverse_1.3.0     
## 
## loaded via a namespace (and not attached):
##   [1] tidyselect_1.1.0    htmlwidgets_1.5.3   grid_4.0.3         
##   [4] munsell_0.5.0       codetools_0.2-18    effectsize_0.4.1   
##   [7] statmod_1.4.35      DT_0.17             miniUI_0.1.1.1     
##  [10] withr_2.3.0         Brobdingnag_1.2-6   colorspace_2.0-0   
##  [13] rstudioapi_0.13     stats4_4.0.3        robustbase_0.93-7  
##  [16] listenv_0.8.0       bayesplot_1.8.0     emmeans_1.5.3      
##  [19] RLRsim_3.1-6        parallelly_1.23.0   coda_0.19-4        
##  [22] vctrs_0.3.6         generics_0.1.0      TH.data_1.0-10     
##  [25] xfun_0.20           R6_2.5.0            markdown_1.1       
##  [28] gamm4_0.2-6         projpred_2.0.2      assertthat_0.2.1   
##  [31] promises_1.1.1      scales_1.1.1        multcomp_1.4-15    
##  [34] nnet_7.3-14         gtable_0.3.0        globals_0.14.0     
##  [37] processx_3.4.5      sandwich_3.0-0      rlang_0.4.10       
##  [40] splines_4.0.3       broom_0.7.3         inline_0.3.17      
##  [43] yaml_2.2.1          reshape2_1.4.4      abind_1.4-5        
##  [46] modelr_0.1.8        threejs_0.3.3       crosstalk_1.1.1    
##  [49] backports_1.2.1     httpuv_1.5.5        rsconnect_0.8.16   
##  [52] tools_4.0.3         ellipsis_0.3.1      ggridges_0.5.3     
##  [55] plyr_1.8.6          base64enc_0.1-3     ps_1.5.0           
##  [58] prettyunits_1.1.1   zoo_1.8-8           haven_2.3.1        
##  [61] fs_1.5.0            survey_4.0          magrittr_2.0.1     
##  [64] data.table_1.13.6   openxlsx_4.2.3      colourpicker_1.1.0 
##  [67] reprex_0.3.0        mvtnorm_1.1-1       sjmisc_2.8.6       
##  [70] matrixStats_0.57.0  hms_1.0.0           shinyjs_2.0.0      
##  [73] mime_0.9            evaluate_0.14       xtable_1.8-4       
##  [76] shinystan_2.5.0     pbkrtest_0.5-0.1    rio_0.5.16         
##  [79] sjstats_0.18.1      readxl_1.3.1        ggeffects_1.0.1    
##  [82] rstantools_2.1.1    compiler_4.0.3      V8_3.4.0           
##  [85] crayon_1.3.4        minqa_1.2.4         htmltools_0.5.1    
##  [88] mgcv_1.8-33         later_1.1.0.1       RcppParallel_5.0.2 
##  [91] lubridate_1.7.9.2   DBI_1.1.0           sjlabelled_1.1.7   
##  [94] dbplyr_1.4.2        MASS_7.3-53         boot_1.3-25        
##  [97] cli_2.2.0           mitools_2.4         parallel_4.0.3     
## [100] insight_0.11.1      igraph_1.2.6        pkgconfig_2.0.3    
## [103] numDeriv_2016.8-1.1 foreign_0.8-81      binom_1.1-1        
## [106] xml2_1.3.2          dygraphs_1.1.1.6    estimability_1.3   
## [109] rvest_0.3.6         callr_3.5.1         digest_0.6.27      
## [112] parameters_0.10.1   fastGHQuad_1.0      rmarkdown_2.6      
## [115] cellranger_1.1.0    curl_4.3            shiny_1.5.0        
## [118] gtools_3.8.2        nloptr_1.2.2.2      lifecycle_0.2.0    
## [121] nlme_3.1-151        jsonlite_1.7.2      fansi_0.4.1        
## [124] pillar_1.4.7        loo_2.4.1           fastmap_1.0.1      
## [127] httr_1.4.2          plotrix_3.7-8       DEoptimR_1.0-8     
## [130] pkgbuild_1.2.0      survival_3.2-7      glue_1.4.2         
## [133] xts_0.12.1          bayestestR_0.8.0    zip_2.1.1          
## [136] shinythemes_1.1.2   iterators_1.0.13    stringi_1.5.3      
## [139] performance_0.6.1
# Read data ======================================
col_types <- cols(
  labid = col_factor(),
  subid = col_factor(),
  subid_unique = col_factor(),
  CDI.form = col_factor(),
  CDI.nwords = col_integer(),
  CDI.prop = col_number(),
  CDI.agerange = col_factor(),
  CDI.agedays = col_integer(),
  CDI.agemin = col_integer(),
  CDI.agemax = col_integer(),
  vocab_nwords = col_integer(),
  standardized.score.CDI = col_character(),
  standardized.score.CDI.num = col_number(),
  IDS_pref = col_number(),
  language = col_factor(),
  language_zone = col_factor(),
  CDI.error = col_logical(),
  Notes = col_character(),
  trial_order = col_factor(),
  method = col_factor(),
  age_days = col_integer(),
  age_mo = col_number(),
  age_group = col_factor(),
  nae = col_logical(),
  gender = col_factor(),
  second_session = col_logical()
)
data.total <- read_csv("data/02b_processed.csv", col_types = col_types)
# TODO: add saved results -Not sure what is wanted here. Can this note be deleted???

Before moving on with the analysis, we have to ready the data by (a) checking for colinearity between z_age_months and CDI.z_age_months and correcting this if necessary, and (b) setting up the contrasts described in our data analysis.

1.1 Colinearity check

First, we run a Kappa test on the possibility of colinearity between z_age_months and CDI.z_age_months.

# Run kappa test
k.age_months <- model.matrix(~ z_age_months + CDI.z_age_months, data = data.total) %>%
  kappa(exact = T)

With a value of 3.2587054, we do not have a colinearity issue and can proceed with the data analysis as planned (The criteria of indicating colinearity is that kappa > 10).

1.2 Contrast Setups

We need gender as an effect-coded factor, and method as a deviation-coded factor. This is achieved in R by using the contr.sum() function with the number of levels for each factor. Notably, when subsetting the UK sample, only two levels of method out of the three in total were left.

# Set contrasts on the total dataset =============
contrasts(data.total$gender) <- contr.sum(2)
contrasts(data.total$method) <- contr.sum(3)

# Create sub-datasets, with contrasts ============
## NAE
data.nae <- data.total %>% subset(language_zone == "NAE") %>% droplevels()
contrasts(data.nae$gender) <- contr.sum(2)
contrasts(data.nae$method) <- contr.sum(3)

## UK (combined-age and separate 18/24 months)

data.uk <- data.total %>% subset(language_zone == "British") %>% droplevels()
contrasts(data.uk$gender) <- contr.sum(2)
contrasts(data.uk$method) <- contr.sum(2) #note that UK sample has only 2 levels, so sum of zero contrasts set to 2 levels


data.uk.18 <- data.total %>% subset(language_zone == "British" & CDI.agerange == 
                                "18") %>% droplevels()
contrasts(data.uk.18$gender) <- contr.sum(2)
contrasts(data.uk.18$method) <- contr.sum(2) #note that UK sample has only 2 levels, so sum of zero contrasts set to 2 levels

data.uk.24 <- data.total %>% subset(language_zone == "British" & CDI.agerange == 
                                "24") %>% droplevels()
contrasts(data.uk.24$gender) <- contr.sum(2)
contrasts(data.uk.24$method) <- contr.sum(2) #note that UK sample has only 2 levels, so sum of zero contrasts set to 2 levels



## Other
data.other <- data.total %>% subset(language_zone == "Other") %>% droplevels()
contrasts(data.other$gender) <- contr.sum(2)
contrasts(data.other$method) <- contr.sum(3)

2 Descriptive Statistics

We first assess the amount of data we have overall per condition and their shape overall.

data.total %>%
  group_by(language_zone, CDI.agerange, method, gender) %>%
  summarise(N = n(), age = mean(CDI.agedays), sd = sd(CDI.agedays)) %>%
  kable()
## `summarise()` regrouping output by 'language_zone', 'CDI.agerange', 'method' (override with `.groups` argument)
language_zone CDI.agerange method gender N age sd
British 18 singlescreen F 21 548.7619 9.065896
British 18 singlescreen M 18 548.2778 9.448346
British 18 eyetracking F 8 546.6250 7.558108
British 18 eyetracking M 10 549.5000 8.934950
British 24 singlescreen F 12 734.6667 9.345230
British 24 singlescreen M 6 733.8333 4.400758
British 24 eyetracking F 12 729.2500 9.333469
British 24 eyetracking M 5 737.8000 8.228001
Other 18 singlescreen F 10 542.6000 6.586181
Other 18 singlescreen M 12 545.4167 5.567084
Other 18 eyetracking F 18 548.1667 6.862087
Other 18 eyetracking M 21 551.7143 6.341474
Other 18 hpp F 32 545.2500 6.530326
Other 18 hpp M 27 550.5556 7.890273
Other 24 singlescreen F 8 729.0000 7.131419
Other 24 singlescreen M 10 726.7000 4.372896
Other 24 eyetracking F 26 730.5385 5.623030
Other 24 eyetracking M 23 730.1304 5.504580
Other 24 hpp F 31 729.0645 9.312842
Other 24 hpp M 25 727.4800 7.665507
NAE 18 singlescreen F 19 554.9474 20.780530
NAE 18 singlescreen M 14 581.2143 24.925052
NAE 18 eyetracking F 1 549.0000 NA
NAE 18 hpp F 64 557.9375 22.801333
NAE 18 hpp M 66 556.1667 22.762035
NAE 24 singlescreen F 23 737.1739 26.604258
NAE 24 singlescreen M 20 739.6000 21.352678
NAE 24 eyetracking F 2 756.5000 34.648232
NAE 24 eyetracking M 1 745.0000 NA
NAE 24 hpp F 58 731.6897 28.149480
NAE 24 hpp M 65 726.2769 27.133068

More detailed information about Descriptive Statistics

#number of lab
data.total %>% 
  select(labid, language_zone) %>% 
  unique() %>% 
  group_by(language_zone) %>% 
  count()
## # A tibble: 3 x 2
## # Groups:   language_zone [3]
##   language_zone     n
##   <fct>         <int>
## 1 British           4
## 2 Other             8
## 3 NAE               9
data.total %>% 
  group_by(language_zone, CDI.agerange) %>% 
  summarize(N = n())
## `summarise()` regrouping output by 'language_zone' (override with `.groups` argument)
## # A tibble: 6 x 3
## # Groups:   language_zone [3]
##   language_zone CDI.agerange     N
##   <fct>         <fct>        <int>
## 1 British       18              57
## 2 British       24              35
## 3 Other         18             120
## 4 Other         24             123
## 5 NAE           18             164
## 6 NAE           24             169
# age range in each age group and language zone
data.total %>% 
  select(subid, language_zone, CDI.agedays, CDI.agerange) %>% 
  unique() %>% 
  group_by(language_zone, CDI.agerange) %>% 
  summarize(age_min = (min(CDI.agedays)/365.25*12),
            age_max = (max(CDI.agedays)/365.25*12))
## `summarise()` regrouping output by 'language_zone' (override with `.groups` argument)
## # A tibble: 6 x 4
## # Groups:   language_zone [3]
##   language_zone CDI.agerange age_min age_max
##   <fct>         <fct>          <dbl>   <dbl>
## 1 British       18              17.6    18.5
## 2 British       24              23.5    24.5
## 3 Other         18              17.5    18.5
## 4 Other         24              23.5    24.5
## 5 NAE           18              16.1    20.0
## 6 NAE           24              22.1    25.9

We then assess the data per lab in terms of sample size and CDI score (vocabulary size, for consistency between language zones).

by_lab <- data.total %>%
  group_by(labid, language_zone, CDI.agerange) %>%
  mutate(tested = n_distinct(subid_unique)) %>%
  select(labid, language_zone, CDI.agerange, tested, vocab_nwords) %>%
  nest(scores = vocab_nwords) %>%
  mutate(model = map(scores, ~ lm(vocab_nwords ~ 1, data = .x)),
         ci = map(model, confint)) %>%
  transmute(tested = tested,
            mean = map_dbl(model, ~ coefficients(.x)[[1]]),
            ci_lower = map_dbl(ci, 1),
            ci_upper = map_dbl(ci, 2)) %>%
  arrange(language_zone) %>%
  rownames_to_column()

ggplot(by_lab,
       aes(x = labid, colour = language_zone,
           y = mean, ymin = ci_lower, ymax = ci_upper)) + 
  geom_linerange() + 
  geom_point(aes(size = tested)) + 
  facet_grid(cols = vars(CDI.agerange), rows = vars(fct_relevel(language_zone, "British", "NAE", "Other")), scales = "free", space ="free_y") + coord_flip(ylim = c(0, 500)) +
  xlab("Lab") + ylab("Vocabulary size") +
  scale_colour_brewer(palette = "Dark2", name = "Language zone",
                      breaks = c("British", "NAE", "Other")) +
  scale_size_continuous(name = "N", breaks = function(x) c(min(x), mean(x), max(x))) +
  theme(legend.position = "bottom")

3 Sample Theory Based Statistics

3.1 Simple Correlation

First, we want to assess quickly if there is a direct correlation between IDS preference and CDI score, computing a Pearson#’s product-moment correlation. We use standardized CDI scores for the North American sample, and raw scores for the British sample. Since CDI grows with age, we run the British sample separately for 18 and 24 months.

# Statistics =====================================
## North American Sample
test.pearson.nae <- cor.test(data.nae$IDS_pref,
                             data.nae$z_standardized_CDI,
                             alternative = "two.sided", method = "pearson")

test.pearson.nae
## 
##  Pearson's product-moment correlation
## 
## data:  data.nae$IDS_pref and data.nae$z_standardized_CDI
## t = -0.91847, df = 305, p-value = 0.3591
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.16349833  0.05977293
## sample estimates:
##         cor 
## -0.05251901
## UK Sample
test.pearson.uk.18 <- cor.test(data.uk.18$IDS_pref,
                            data.uk.18$z_vocab_nwords,
                            alternative = "two.sided", method = "pearson")

test.pearson.uk.18
## 
##  Pearson's product-moment correlation
## 
## data:  data.uk.18$IDS_pref and data.uk.18$z_vocab_nwords
## t = -0.026578, df = 55, p-value = 0.9789
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2639050  0.2572241
## sample estimates:
##          cor 
## -0.003583759
test.pearson.uk.24 <- cor.test(data.uk.24$IDS_pref,
                            data.uk.24$z_vocab_nwords,
                            alternative = "two.sided", method = "pearson")

test.pearson.uk.24
## 
##  Pearson's product-moment correlation
## 
## data:  data.uk.24$IDS_pref and data.uk.24$z_vocab_nwords
## t = 0.45131, df = 33, p-value = 0.6547
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2617574  0.4010989
## sample estimates:
##        cor 
## 0.07832108

Plots for correlation

## North American Sample
### Get correlation value for annotation
cor_text <- "paste(italic(R)^2, \" =\")"
cor_value <- round(test.pearson.nae$estimate, 3)

### Build plot
plot.pearson.nae <- data.nae %>%
  ggplot(aes(x = IDS_pref,
             y = standardized.score.CDI.num)) +
  xlab("IDS preference") + ylab("Standardized CDI score") +
  geom_point() +
  geom_smooth(method = lm) +
  annotate("text", x = -.9, y = 51, parse = T, size = 4,
           label = paste(cor_text, cor_value, sep = "~"))

## UK Sample
cor_value <- round(test.pearson.uk.18$estimate, 3)
plot.pearson.uk.18 <- data.uk.18 %>%
  ggplot(aes(x = IDS_pref,
             y = vocab_nwords)) +
  xlab("IDS preference") + ylab("Vocabulary size (in words)") +
  geom_point() +
  geom_smooth(method = lm) +
  annotate("text", x = .8, y = 150, parse = T, size = 4,
           label = paste(cor_text, cor_value, sep = "~"))

cor_value <- round(test.pearson.uk.24$estimate, 3)
plot.pearson.uk.24 <- data.uk.24 %>%
  ggplot(aes(x = IDS_pref,
             y = vocab_nwords)) +
  xlab("IDS preference") + ylab("Vocabulary size (in words)") +
  geom_point() +
  geom_smooth(method = lm) +
  annotate("text", x = .8, y = 150, parse = T, size = 4,
           label = paste(cor_text, cor_value, sep = "~"))

# Global plot
plot.pearson <- ggarrange(plot.pearson.nae, plot.pearson.uk.18, plot.pearson.uk.24, ncol = 3)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 26 rows containing non-finite values (stat_smooth).
## Warning: Removed 26 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

plot.pearson

#TODO: These plots need to be cleaned up!

ggsave("plots/corr_plot.pdf", plot.pearson,
       units = "mm", width = 180, height = 100, dpi = 1000)

We see no obvious direct link between IDS prefernce and CDI score here. However, an effect might appear once we take into account various factors that might interact with IDS preference and/or CDI score. We can also first enhance these plots with information about the age group at which infants were tested (18- or 24-month-old) for the NAE sample, using vocabulary size to better compare the NAE and UK samples.

plot.age_group <- data.total %>%
  subset(language_zone != "Other") %>%
  droplevels() %>%
  ggplot(aes(x = IDS_pref,
             y = vocab_nwords,
             colour = CDI.agerange)) +
  facet_wrap(vars(language_zone),
             labeller = as_labeller(c("British" = "UK samples",
                                      "NAE" = "North Amercian samples"))) +
  xlab("Standardized IDS prefence score") + ylab("Vocabulary size (in words)") + theme(legend.position = "top") +
  geom_point() +
  geom_smooth(method = lm) +
  scale_colour_brewer(palette = "Dark2", name = "Age group",
                      breaks = c("18", "24"),
                      labels = c("18mo", "24m"))
ggsave("plots/scatter_age.pdf", plot.age_group,
       units = "mm", width = 180, height = 100, dpi = 1000)
## `geom_smooth()` using formula 'y ~ x'
(plot.age_group)
## `geom_smooth()` using formula 'y ~ x'

3.2 Mixed-Effects Model by Language Zone

Here, we run a mixed-effects model including only theoretically motivated effects, as described in the pre-registration. We start with the full model bellow, simplifying the random effects structure until it converges.

3.2.1 NAE full model

# Run models =====================================
## NAE

data.nae$centered_IDS_pref <- scale(data.nae$IDS_pref, center = T, scale = F)

lmer.full.nae <- lmer(standardized.score.CDI.num ~ CDI.z_age_months + gender +
                        z_age_months + method + centered_IDS_pref +
                        centered_IDS_pref:method + centered_IDS_pref:CDI.z_age_months + centered_IDS_pref:z_age_months +
                        (1 | labid) + (1 | subid_unique),
                      data = data.nae)

summary(lmer.full.nae)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## standardized.score.CDI.num ~ CDI.z_age_months + gender + z_age_months +  
##     method + centered_IDS_pref + centered_IDS_pref:method + centered_IDS_pref:CDI.z_age_months +  
##     centered_IDS_pref:z_age_months + (1 | labid) + (1 | subid_unique)
##    Data: data.nae
## 
## REML criterion at convergence: 2795.2
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.19986 -0.50964 -0.00062  0.40872  2.40426 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  subid_unique (Intercept) 554.4    23.55   
##  labid        (Intercept)  34.8     5.90   
##  Residual                 219.1    14.80   
## Number of obs: 307, groups:  subid_unique, 211; labid, 8
## 
## Fixed effects:
##                                    Estimate Std. Error       df t value
## (Intercept)                         31.3285     6.4558  53.8578   4.853
## CDI.z_age_months                     1.1559     0.3412 118.3835   3.388
## gender1                             -1.4068     1.8876 194.1693  -0.745
## z_age_months                        -0.1076     0.6285  70.2700  -0.171
## method1                              6.4913     6.5640 127.7625   0.989
## method2                            -14.9012    11.8935 179.1258  -1.253
## centered_IDS_pref                  -33.9167    24.7927 208.6284  -1.368
## method1:centered_IDS_pref           26.2532    25.4357 208.7560   1.032
## method2:centered_IDS_pref          -56.6834    49.5010 207.8344  -1.145
## CDI.z_age_months:centered_IDS_pref   0.5553     1.0468 120.4990   0.530
## z_age_months:centered_IDS_pref       2.1822     2.0830 190.6377   1.048
##                                    Pr(>|t|)    
## (Intercept)                        1.08e-05 ***
## CDI.z_age_months                   0.000957 ***
## gender1                            0.456997    
## z_age_months                       0.864526    
## method1                            0.324572    
## method2                            0.211878    
## centered_IDS_pref                  0.172780    
## method1:centered_IDS_pref          0.303201    
## method2:centered_IDS_pref          0.253486    
## CDI.z_age_months:centered_IDS_pref 0.596748    
## z_age_months:centered_IDS_pref     0.296136    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) CDI.z__ gendr1 z_g_mn methd1 methd2 c_IDS_ m1:_ID m2:_ID
## CDI.z_g_mnt -0.059                                                         
## gender1     -0.030  0.021                                                  
## z_age_mnths  0.036 -0.036  -0.019                                          
## method1     -0.724  0.000   0.021  0.102                                   
## method2      0.883 -0.029  -0.034  0.020 -0.881                            
## cntrd_IDS_p  0.357  0.006   0.038 -0.040 -0.347  0.384                     
## mthd1:_IDS_ -0.335 -0.027  -0.027  0.017  0.360 -0.383 -0.926              
## mthd2:_IDS_  0.349  0.020   0.061 -0.007 -0.360  0.390  0.965 -0.971       
## CDI.__:_IDS -0.011  0.014  -0.019 -0.019 -0.028  0.008 -0.048  0.026 -0.045
## z_g_m:_IDS_ -0.041  0.025   0.102  0.046 -0.037  0.022  0.022 -0.087  0.132
##             CDI.__:
## CDI.z_g_mnt        
## gender1            
## z_age_mnths        
## method1            
## method2            
## cntrd_IDS_p        
## mthd1:_IDS_        
## mthd2:_IDS_        
## CDI.__:_IDS        
## z_g_m:_IDS_ -0.068
# robust_lmer.full.nae <- robustlmm::rlmer(standardized.score.CDI.num ~ CDI.z_age_months + gender +
#                         z_age_months + method + centered_IDS_pref +
#                         centered_IDS_pref:method + centered_IDS_pref:CDI.z_age_months + centered_IDS_pref:z_age_months +
#                         (1 | labid),
#                       data = data.nae)
# 
# 
# summary(robust_lmer.full.nae) #this model is used to see if we can meet some statistical assumption better but we decided to use the original model as the inferential statistical results are consistent

full.nae_pvalue <- anova(lmer.full.nae) %>% 
  as_tibble(rownames = "Parameter") #this gives us the Type III p values

# ==========
# Sequentially removed random effects:
# IDS_pref:z_age_months
# IDS_pref:CDI.z_age_months
# IDS_pref:method
# IDS_pref
# method
# z_age_months
# gender
# CDI.z_age_months
#==========

3.2.1.1 (Optional) Checking mixed-model assumptions. We will check the following:

  1. Linearlity
  2. Normality of the residuals
  3. Homoscedasticity of residuals
  4. No autocorrelation
  5. No multicollinearity
#First, check linearity
# data.nae$resid <- residuals(lmer.full.nae)
# 
# plot(data.nae$resid, data.nae$standardized.score.CDI)

#Second, check normality
plot_model(lmer.full.nae, type = 'diag') ## we do have right-skewed normality of residuals
## Warning in checkMatrixPackageVersion(): Package version inconsistency detected.
## TMB was built with Matrix version 1.2.18
## Current Matrix version is 1.3.2
## Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package
## [[1]]
## `geom_smooth()` using formula 'y ~ x'

## 
## [[2]]
## [[2]]$subid_unique
## `geom_smooth()` using formula 'y ~ x'

## 
## [[2]]$labid
## `geom_smooth()` using formula 'y ~ x'

## 
## 
## [[3]]

## 
## [[4]]
## `geom_smooth()` using formula 'y ~ x'

#Third, check autocorrelation
re_run_lme.full.nae <- nlme::lme(standardized.score.CDI.num ~ CDI.z_age_months + gender +
                        z_age_months + method + centered_IDS_pref +
                        centered_IDS_pref:method + centered_IDS_pref:CDI.z_age_months + centered_IDS_pref:z_age_months, random = ~1 | labid,
                        method = "REML",
                      data = data.nae, na.action = na.exclude)

plot(nlme::ACF(re_run_lme.full.nae, resType = "normalized")) #there is no sign for autocorrelation

#Lastly, check multi-collinearity
car::vif(lmer.full.nae) #we do see a multicollineartiy for the IDS preference variable, even though we have centered the IDS preference score. It is probably related to the number the participating labs (as this is the group level that we are controlling) and how we entered interaction between IDS preference and other variables (that lack variability in the current sample). We need to keep IDS preference in the model as exploring the relationship between IDS preference and CDI score is the key research question in the paper.
##                                         GVIF Df GVIF^(1/(2*Df))
## CDI.z_age_months                    1.008664  1        1.004323
## gender                              1.038941  1        1.019285
## z_age_months                        1.098275  1        1.047986
## method                              1.290505  2        1.065835
## centered_IDS_pref                  19.218063  1        4.383841
## method:centered_IDS_pref           20.770091  2        2.134812
## CDI.z_age_months:centered_IDS_pref  1.014148  1        1.007049
## z_age_months:centered_IDS_pref      1.316334  1        1.147316

3.2.2 UK full model

lmer.full.uk <- lmer(vocab_nwords ~ CDI.z_age_months + gender +
                       z_age_months + method + IDS_pref +
                       IDS_pref:method + IDS_pref:CDI.z_age_months + IDS_pref:z_age_months +
                       #(1 | labid) + 
                       (1 | subid_unique),
                     data = data.uk) 

summary(lmer.full.uk)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: vocab_nwords ~ CDI.z_age_months + gender + z_age_months + method +  
##     IDS_pref + IDS_pref:method + IDS_pref:CDI.z_age_months +  
##     IDS_pref:z_age_months + (1 | subid_unique)
##    Data: data.uk
## 
## REML criterion at convergence: 1001.5
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.61591 -0.38896 -0.02037  0.37307  1.93394 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  subid_unique (Intercept) 4834     69.53   
##  Residual                 2209     47.00   
## Number of obs: 92, groups:  subid_unique, 76
## 
## Fixed effects:
##                           Estimate Std. Error      df t value Pr(>|t|)    
## (Intercept)                178.534     10.936  67.996  16.326  < 2e-16 ***
## CDI.z_age_months            32.357      2.428  30.360  13.326 3.19e-14 ***
## gender1                     10.142     10.156  67.389   0.999   0.3215    
## z_age_months                -7.226      3.621  71.967  -1.996   0.0498 *  
## method1                     -8.451     13.139  66.946  -0.643   0.5223    
## IDS_pref                    25.907     27.203  72.778   0.952   0.3441    
## method1:IDS_pref           -25.499     29.465  72.479  -0.865   0.3897    
## CDI.z_age_months:IDS_pref   10.598      7.059  34.815   1.501   0.1423    
## z_age_months:IDS_pref       -8.352      8.978  78.184  -0.930   0.3551    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) CDI.z__ gendr1 z_g_mn methd1 IDS_pr m1:IDS CDI.__:
## CDI.z_g_mnt  0.183                                                   
## gender1     -0.024 -0.091                                            
## z_age_mnths -0.021 -0.210  -0.037                                    
## method1     -0.312 -0.067  -0.165  0.544                             
## IDS_pref    -0.336 -0.060  -0.068  0.042  0.184                      
## mthd1:IDS_p  0.167  0.111   0.056 -0.146 -0.306 -0.104               
## CDI.__:IDS_ -0.095 -0.293   0.019  0.106  0.129  0.233 -0.138        
## z_g_mn:IDS_ -0.005  0.110  -0.214 -0.387 -0.136 -0.187  0.425 -0.345
full.uk_pvalue <- anova(lmer.full.uk) %>% 
  as_tibble(rownames = "Parameter") #this gives us the Type III p values

#==========
# Sequentially removed random effects:
# IDS_pref:z_age_months
# IDS_pref:CDI.z_age_months
# IDS_pref:method
# IDS_pref
# method
# z_age_months
# gender
# CDI.z_age_months
# (1| labid) please note the variance was very little and reported as zero in the results, we needed to remove this random effect

3.2.2.1 (Optional) Checking mixed-model assumptions. We will check the following:

  1. Linearlity
  2. Normality of the residuals
  3. Homoscedasticity of residuals
  4. No autocorrelation
  5. No multicollinearity
#First, check linearity. The plot looks linear
data.uk$resid <- residuals(lmer.full.uk)
 
plot(data.uk$resid, data.uk$vocab_nwords)

#Second, check normality
plot_model(lmer.full.uk, type = 'diag') ## we do have right-skewed normality of residuals
## [[1]]
## `geom_smooth()` using formula 'y ~ x'

## 
## [[2]]
## [[2]]$subid_unique
## `geom_smooth()` using formula 'y ~ x'

## 
## 
## [[3]]

## 
## [[4]]
## `geom_smooth()` using formula 'y ~ x'

#Third, check autocorrelation
re_run_lme.full.uk <- nlme::lme(vocab_nwords ~ CDI.z_age_months + gender +
                       z_age_months + method + IDS_pref +
                       IDS_pref:method + IDS_pref:CDI.z_age_months + IDS_pref:z_age_months, random = ~1 | labid,
                        method = "REML",
                      data = data.nae, na.action = na.exclude)

plot(nlme::ACF(re_run_lme.full.uk, resType = "normalized")) #there is no sign for autocorrelation

#Lastly, check multi-collinearity
car::vif(lmer.full.uk) #no problem for multicollinearlity
##          CDI.z_age_months                    gender              z_age_months 
##                  1.187307                  1.141095                  1.913758 
##                    method                  IDS_pref           method:IDS_pref 
##                  1.762512                  1.127320                  1.445690 
## CDI.z_age_months:IDS_pref     z_age_months:IDS_pref 
##                  1.308464                  1.899772

We now want to check the statistical power of significant effects, and discard any models with significant effects that do not reach 80% power. This however leads to too many warnings of singularity issues on the model updates inherent to the simr power simulations, hence we cannot obtain satisfactory power estimates as pre-registered.

AST: Note that we don’t have any IV(s) that turned out to be significant in the Full NAE model. So we won’t run the power analysis check. For the UK full model, there are two statistically significant IV: CDI_age and gender. The post hoc power check suggested that we have high power in detecting the effect of CDI_age but not gender. Note that gender has a smaller effect size to begin with, so this may partially explain why we have less power in detecting it in the model. As there can be a number of different factors that determines the posthoc power, we decided not to remove gender in the model based on posthoc power analysis check.

check_pwr_uk_cdi_age <- simr::powerSim(lmer.full.uk, test = fixed("CDI.z_age_months", method = "z"), seed = 2, nsim = 1000, alpha = 0.05) #specify that Gender is the ixed effect that we are looking into

check_pwr_uk_cdi_age

check_pwr_uk_gender <- simr::powerSim(lmer.full.uk, test = fixed("gender", method = "z"), seed = 3, nsim = 1000, alpha = 0.05) #specify that Gender is the ixed effect that we are looking into

check_pwr_uk_gender

3.2.3 Combined Sample

For this combined analysis, we first need to restrain the age range for the NAE sample (previously ±2 months, now ±1.5 months). [Note: This should be +/- 0.5 months…]

# Create dataset with British and NAE only
data.uk_nae <- data.total %>%
  subset(language_zone %in% c("British", "NAE")) %>%
  mutate(CDI.agemin = ifelse(language_zone == "NAE",
                             CDI.agemin + round(.5*365.25/12),
                             CDI.agemin),
         CDI.agemax = ifelse(language_zone == "NAE",
                             CDI.agemax - round(.5*365.25/12),
                             CDI.agemax)) %>%
  subset(!(CDI.agedays < CDI.agemin | CDI.agedays > CDI.agemax)) %>%
  droplevels()
# Create contrasts for analysis
contrasts(data.uk_nae$gender) <- contr.sum(2)
contrasts(data.uk_nae$method) <- contr.sum(3)
contrasts(data.uk_nae$language_zone) <- contr.sum(2)

We can then run the planned combined analysis adding the main effect and interactions of language_zone.

lmer.full.uk_nae <- lmer(CDI.prop ~ CDI.z_age_months + language_zone + gender +
                           z_age_months + method + IDS_pref + IDS_pref:language_zone +
                           IDS_pref:method + IDS_pref:CDI.z_age_months + IDS_pref:z_age_months +
                           (1 | labid) + (1 | subid_unique),
                         data = data.uk_nae)

summary(lmer.full.uk_nae)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: CDI.prop ~ CDI.z_age_months + language_zone + gender + z_age_months +  
##     method + IDS_pref + IDS_pref:language_zone + IDS_pref:method +  
##     IDS_pref:CDI.z_age_months + IDS_pref:z_age_months + (1 |  
##     labid) + (1 | subid_unique)
##    Data: data.uk_nae
## 
## REML criterion at convergence: -97.6
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.89103 -0.50634 -0.00619  0.44106  2.47651 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  subid_unique (Intercept) 0.020831 0.14433 
##  labid        (Intercept) 0.001502 0.03875 
##  Residual                 0.020069 0.14166 
## Number of obs: 397, groups:  subid_unique, 290; labid, 13
## 
## Fixed effects:
##                             Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)                 0.322768   0.020166  13.299171  16.006 4.49e-10 ***
## CDI.z_age_months            0.051004   0.003075 185.198888  16.586  < 2e-16 ***
## language_zone1              0.092405   0.024873  11.320472   3.715  0.00325 ** 
## gender1                     0.029054   0.011460 269.840430   2.535  0.01180 *  
## z_age_months               -0.003430   0.004184 160.120357  -0.820  0.41365    
## method1                    -0.005645   0.025656  19.894754  -0.220  0.82810    
## method2                    -0.005063   0.040255  18.277056  -0.126  0.90128    
## IDS_pref                    0.007971   0.040343 296.222646   0.198  0.84351    
## language_zone1:IDS_pref     0.029449   0.058002 307.606650   0.508  0.61201    
## method1:IDS_pref           -0.057186   0.052785 306.712106  -1.083  0.27949    
## method2:IDS_pref            0.043003   0.087955 311.318982   0.489  0.62524    
## CDI.z_age_months:IDS_pref   0.009680   0.008311 184.394430   1.165  0.24565    
## z_age_months:IDS_pref      -0.004877   0.011250 274.134058  -0.434  0.66497    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 13 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
combined.full.uk_nae_pvalue <- anova(lmer.full.uk_nae) %>% 
  as_tibble(rownames = "Parameter") #this gives us the Type III p values

#==========
# Sequentially removed random effects:
# IDS_pref:z_age_months
# IDS_pref:CDI.z_age_months
# IDS_pref:method
# IDS_pref:language_zone
# IDS_pref
# method
# z_age_months
# gender
# language_zone
# CDI.z_age_months
#==========

3.2.3.1 (Optional) Checking mixed-model assumptions

  1. Linearlity
  2. Normality of the residuals
  3. Homoscedasticity of residuals
  4. No autocorrelation
  5. No multicollinearity
#First, check linearity. The plot looks linear
data.uk_nae$resid <- residuals(lmer.full.uk_nae)
 
plot(data.uk_nae$resid, data.uk_nae$CDI.prop)

#Second, check normality
plot_model(lmer.full.uk_nae, type = 'diag') ## we do have right-skewed normality of residuals
## [[1]]
## `geom_smooth()` using formula 'y ~ x'

## 
## [[2]]
## [[2]]$subid_unique
## `geom_smooth()` using formula 'y ~ x'

## 
## [[2]]$labid
## `geom_smooth()` using formula 'y ~ x'

## 
## 
## [[3]]

## 
## [[4]]
## `geom_smooth()` using formula 'y ~ x'

#Third, check autocorrelation
re_run_lme.full.uk_nae <- nlme::lme(CDI.prop ~ CDI.z_age_months + language_zone + gender +
                           z_age_months + method + IDS_pref + IDS_pref:language_zone +
                           IDS_pref:method + IDS_pref:CDI.z_age_months + IDS_pref:z_age_months,
                           random = ~1  | labid,
                        method = "REML",
                      data = data.uk_nae, na.action = na.exclude)

plot(nlme::ACF(re_run_lme.full.uk_nae, resType = "normalized")) #there is no sign for autocorrelation

#Lastly, check multi-collinearity
car::vif(lmer.full.uk_nae) #no problem for multicollinearlity
##                               GVIF Df GVIF^(1/(2*Df))
## CDI.z_age_months          1.213821  1        1.101736
## language_zone             1.950765  1        1.396698
## gender                    1.033569  1        1.016646
## z_age_months              1.362839  1        1.167407
## method                    2.449977  2        1.251096
## IDS_pref                  1.466093  1        1.210823
## language_zone:IDS_pref    3.213972  1        1.792755
## method:IDS_pref           3.870057  2        1.402585
## CDI.z_age_months:IDS_pref 1.244461  1        1.115554
## z_age_months:IDS_pref     1.459354  1        1.208037

We then compute \(p\)-values, but leave out power estimates for those \(p\)-values as above. Again, we have a lot of singular fit issues for the power checks and decided not to remove parameters based on posthoc power analysis.

check_pwr_combined_cdi_age <- simr::powerSim(lmer.full.uk_nae, test = fixed("CDI.z_age_months", method = "z"), seed = 2, nsim = 1000, alpha = 0.05) #specify that Gender is the ixed effect that we are looking into

check_pwr_combined_cdi_age

check_pwr_combined_lang_zone <- simr::powerSim(lmer.full.uk_nae, test = fixed("language_zone", method = "z"), seed = 3, nsim = 1000, alpha = 0.05) #specify that Gender is the ixed effect that we are looking into

check_pwr_combined_lang_zone

check_pwr_combined_gender <- simr::powerSim(lmer.full.uk_nae, test = fixed("gender", method = "z"), seed = 3, nsim = 1000, alpha = 0.05) #specify that Gender is the ixed effect that we are looking into

check_pwr_combined_gender